Tumor evolution project

Data used

In this notebook, we are using the pbta.tsv file geenrated by the sample-distribution-analysis module.

Set up

suppressPackageStartupMessages({
  library(tidyverse)
  library(rstatix)
  library(ggpubr)
})

Directories and File Inputs/Outputs

# Detect the ".git" folder -- this will be in the project root directory
# Use this as the root directory to ensure proper sourcing of functions 
# no matter where this is called from
root_dir <- rprojroot::find_root(rprojroot::has_dir(".git"))
analysis_dir <- file.path(root_dir, "analyses", "tmb-vaf-longitudinal")
input_dir <- file.path(analysis_dir, "input")
files_dir <- file.path(root_dir, "analyses", "sample-distribution-analysis", "results")

# Input files
pbta_file <- file.path(files_dir, "pbta.tsv") # file from add-sample-distribution module
tmb_file <- file.path(input_dir, "snv-mutation-tmb-coding.tsv")
palette_file <- file.path(root_dir, "figures", "palettes", "tumor_descriptor_color_palette.tsv")

# File path to plot directory
plots_dir <-
  file.path(analysis_dir, "plots")
if (!dir.exists(plots_dir)) {
  dir.create(plots_dir)
}

source(paste0(root_dir, "/figures/scripts/theme.R"))

Read in data and process

# Make this reproducible
set.seed(2023)

# Read in tmb file and remove hypermutant samples
tmb <- readr::read_tsv(tmb_file, guess_max = 100000, show_col_types = FALSE) %>%
  filter(!tmb >= 10) %>% 
  dplyr::rename(Kids_First_Biospecimen_ID = Tumor_Sample_Barcode) # change name of the biospecimen to match the one from the histologies files

# Read in pbta file and create df
df <- readr::read_tsv(pbta_file, guess_max = 100000, show_col_types = FALSE) %>%
  filter(!(experimental_strategy == "RNA-Seq"),
         !(tumor_descriptor == "Unavailable")) %>% # these are 2 samples
  left_join(tmb, by = c("Kids_First_Biospecimen_ID", "experimental_strategy")) %>%
  select(Kids_First_Participant_ID, Kids_First_Biospecimen_ID, tumor_descriptor,
         broad_histology, match_id_DNA, experimental_strategy, 
         cancer_group, short_histology, tmb, mutation_count, DNA) %>%
  dplyr:::mutate(short_histology_sum = ifelse(short_histology == "HGAT", "HGG",
                                              ifelse(short_histology == "LGAT", "LGG", 
                                                     ifelse(short_histology == "Ependymoma", "Ependymoma", 
                                                            ifelse(short_histology == "Medulloblastoma", "Medulloblastoma", 
                                                                   ifelse(short_histology == "ATRT", "ATRT", "Other"))))),
                 log10_tmb = abs(log10(tmb)),
                 tumor_descriptor = factor(tumor_descriptor),
                 tumor_descriptor = fct_relevel(tumor_descriptor, c("Diagnosis", "Progressive", "Recurrence", "Deceased", "Second Malignancy", "Unavailable"))) %>% 
  distinct(match_id_DNA, .keep_all = TRUE) %>% # keep only one bs_sample per genomic assay (WGS/WXS)
  filter(!is.na(tmb))
Warning: There was 1 warning in `dplyr:::mutate()`.
ℹ In argument: `tumor_descriptor = fct_relevel(...)`.
Caused by warning:
! 1 unknown level in `f`: Unavailable
# How many samples per Timepoint?
print(df %>% count(tumor_descriptor))

# How many samples per cancer type?
print(df %>% count(short_histology_sum))

# How many samples per Timepoint and cancer type?
rename(count(df, tumor_descriptor, short_histology_sum), Freq = n)

# How many cases with unpaired longitudinal samples?
no_samples <- length(unique(df$Kids_First_Participant_ID))

# Let's count and remove any timepoints with less than 2 samples therein
timepoint_n_df <- df %>% 
  count(tumor_descriptor) %>% 
  dplyr::mutate(tumor_descriptor_n = glue::glue("{tumor_descriptor} (N={n})")) %>% 
  dplyr::rename(timepoint_n = n)

df_plot <- df %>% 
  left_join(timepoint_n_df, by = c("tumor_descriptor")) %>%
  filter(!timepoint_n <= 2) # remove if total number of values per group is less than 2

How many patient samples in the unpaired longitudinal data?

There are 1924 samples in total.

All unpaired samples

We are interested in how TMB changes over the time in all PBTA samples (unpaired samples).

# Read color palette
palette_df <- readr::read_tsv(palette_file, guess_max = 100000, show_col_types = FALSE) %>% 
  mutate(tumor_descriptor = color_names)

# Define and order palette
palette <- palette_df$hex_codes
names(palette) <- palette_df$tumor_descriptor

# Define label for plots
Timepoint <- df_plot$tumor_descriptor

# Define ylim
ylim <- max(df_plot$tmb)
print(df_plot %>% 
        ggplot2::ggplot() + 
        ggplot2::aes(x = tumor_descriptor, 
                     y = tmb,
                     color = Timepoint) + 
        ggplot2::geom_jitter(width = 0.2, alpha = 0.5, size = 1.5) + 
        # light guiding line representing 0 exposure
        ggplot2::geom_hline(yintercept = 0, size = 0.15) + 
        ggplot2::scale_color_manual(values = palette, 
                                    breaks = sort(names(palette))) +
        ggplot2::stat_summary(color = "black", size = 0.3) + 
        ggplot2::labs(x = "Tumor descriptor",
                      y = "TMB") +
        theme_Publication() +
        scale_y_continuous(limits = c(0, ylim)) +
        theme(axis.text.x = element_text(angle = 90)))


# Save the plot
ggsave(filename = "TMB-unpaired-longitudinal-samples.pdf",
       path = plots_dir, 
       width = 10, 
       height = 5, 
       device = "pdf", 
       useDingbats = FALSE)

All unpaired samples across cancer types

We will use the short_histology column to display TMB differences across cancer types.

print(df_plot %>%
        ggplot2::ggplot() + 
        ggplot2::aes(x = tumor_descriptor, 
                     y = tmb,
                     color = Timepoint) + 
        ggplot2::geom_jitter(width = 0.2, alpha = 0.5, size = 1.5) + 
        # light guiding line representing 0 exposure
        ggplot2::geom_hline(yintercept = 0, size = 0.15) + 
        ggplot2::scale_color_manual(values = palette, 
                                    breaks = sort(names(palette))) +
        ggplot2::stat_summary(color = "black", size = 0.3) + 
        ggplot2::facet_wrap(~short_histology, nrow = 8) +
        ggplot2::labs(x = "Tumor descriptor",
                      y = "TMB") +
        theme_Publication() + 
        theme(axis.text.x = element_text(angle = 90)) +
        scale_y_continuous(limits = c(0, ylim)))


# Save the plot
ggsave(filename = "TMB-unpaired-longitudinal-samples-cancer-type.pdf", 
       path = plots_dir, 
       width = 25, 
       height = 20, 
       device = "pdf", 
       useDingbats = FALSE)

We observe that majority of samples are High- and Low-grade gliomas as well as Ependymoma, so we will classify accordingly (versus any other cancer types).

All unpaired samples across cancer types summarized

# Let's count and remove any cancer types and timepoints with less than 2 samples therein
tumor_descriptor_cancer_n_df <- df_plot %>% 
  count(short_histology_sum, tumor_descriptor) %>% 
  dplyr::mutate(tumor_descriptor_cancer_n = glue::glue("{short_histology_sum}_{tumor_descriptor}  (N={n})")) %>% 
  dplyr::rename(histology_n = n)

df_plot_cancer <- df_plot %>% 
  left_join(tumor_descriptor_cancer_n_df, by = c("tumor_descriptor", "short_histology_sum")) %>%
  filter(!histology_n <= 2) # remove if total number of values per group is less than 2

# Define label for plots
Timepoint_cancer <- df_plot_cancer$tumor_descriptor

# Define ylim
ylim_cancer <- max(df_plot_cancer$tmb)

# Plot the plot!
print(df_plot_cancer %>% 
        ggplot2::ggplot() + 
        ggplot2::aes(x = tumor_descriptor, 
                     y = tmb,
                     color = Timepoint_cancer) + 
        ggplot2::geom_jitter(width = 0.2, alpha = 0.5, size = 1.5) + 
        # light guiding line representing 0 exposure
        ggplot2::geom_hline(yintercept = 0, size = 0.15) + 
        ggplot2::scale_color_manual(values = palette, 
                                    breaks = sort(names(palette))) +
        ggplot2::stat_summary(color = "black", size = 0.3) + 
        ggplot2::facet_wrap(~short_histology_sum, nrow = 1) +
        ggplot2::labs(x = "Tumor descriptor",
                      y = "TMB") +
        theme_Publication() + 
        theme(axis.text.x = element_text(angle = 90)) +
        scale_y_continuous(limits = c(0, ylim_cancer)))


# Save the plot
ggsave(filename = "TMB-unpaired-longitudinal-samples-cancer-type-sum.pdf", 
       path = plots_dir, 
       width = 12, 
       height = 6, 
       device = "pdf", 
       useDingbats = FALSE)


# Plot by using tumor_descriptor_cancer_n
# Let's order first x axis 
#TODO: To make the f reproducible
f = c("ATRT_Diagnosis  (N=48)", "ATRT_Progressive  (N=10)", "ATRT_Recurrence  (N=3)", "ATRT_Deceased  (N=4)",
  "Ependymoma_Diagnosis  (N=113)", "Ependymoma_Progressive  (N=20)" , "Ependymoma_Recurrence  (N=24)", "Ependymoma_Deceased  (N=7)",
      "HGG_Diagnosis  (N=235)", "HGG_Progressive  (N=22)", "HGG_Recurrence  (N=15)", "HGG_Deceased  (N=72)", "HGG_Second Malignancy  (N=5)",
      "LGG_Diagnosis  (N=405)" , "LGG_Progressive  (N=61)", "LGG_Recurrence  (N=25)", "LGG_Deceased  (N=7)", "LGG_Second Malignancy  (N=4)" ,
      "Medulloblastoma_Diagnosis  (N=217)", "Medulloblastoma_Progressive  (N=6)", "Medulloblastoma_Recurrence  (N=6)" , "Medulloblastoma_Deceased  (N=6)",
      "Other_Diagnosis  (N=538)", "Other_Progressive  (N=89)", "Other_Recurrence  (N=80)", "Other_Deceased  (N=8)")

df_plot_cancer <- df_plot_cancer %>% 
  mutate(tumor_descriptor_cancer_n = factor(tumor_descriptor_cancer_n),
                 tumor_descriptor_cancer_n = fct_relevel(tumor_descriptor_cancer_n, f))
                                                         
# Plot the plot!
print(df_plot_cancer %>% 
        ggplot2::ggplot() + 
        ggplot2::aes(x = tumor_descriptor_cancer_n, 
                     y = tmb,
                     color = Timepoint_cancer) + 
        ggplot2::geom_jitter(width = 0.2, alpha = 0.5, size = 1.5) + 
        # light guiding line representing 0 exposure
        ggplot2::geom_hline(yintercept = 0, size = 0.15) + 
        ggplot2::scale_color_manual(values = palette, 
                                    breaks = sort(names(palette))) +
        ggplot2::stat_summary(color = "black", size = 0.3) + 
        ggplot2::labs(x = "Tumor descriptor",
                      y = "TMB") +
        theme_Publication() + 
        theme(axis.text.x = element_text(angle = 90)) +
        scale_y_continuous(limits = c(0, ylim_cancer)))


# Save the plot
ggsave(filename = "TMB-unpaired-longitudinal-samples-cancer-type-sum-n.pdf", 
       path = plots_dir, 
       width = 12, 
       height = 6, 
       device = "pdf", 
       useDingbats = FALSE)

#TODO: To choose one of the plots in the code chunk `plot-cancer-group-sum`

Summary statistics for Timepoints and cancer types

Considering the inequalities in the sampling effort for each timepoint, let’s explore whether the sampling size affects the observed patterns, e.g., indication of Progressive and Recurrence having higer TMB compared to Diagnosis for High-grade glioma patients.

# Inspect some random rows of the data by groups
set.seed(2023)
df_plot_cancer %>% 
  select(short_histology_sum, tumor_descriptor, tmb, log10_tmb) %>% 
  sample_n_by(short_histology_sum, tumor_descriptor, tmb, size = 1)

# Compute some summary statistics (mean and sd) by groups:
stat.summary <- df_plot_cancer %>%
  group_by(short_histology_sum, tumor_descriptor) %>%
  get_summary_stats(tmb, type = "mean_sd")
stat.summary

# Pairwise comparisons between time points at each group levels
# Paired t-test is used because we have repeated measures by time
stat.test <- df_plot_cancer %>%
  group_by(short_histology_sum) %>%
  pairwise_t_test(log10_tmb ~ tumor_descriptor, 
                  paired = FALSE, 
                  p.adjust.method = "bonferroni")
stat.test

# Add statistical test p-values
stat.test <- stat.test %>% add_xy_position(x = "tumor_descriptor")

# Create bxp
print(ggboxplot(df_plot_cancer,
                x = "tumor_descriptor",
                y = "log10_tmb",
                color = "tumor_descriptor",
                palette = palette) +
        facet_wrap(~short_histology_sum, nrow = 1) +
        theme_Publication() + 
        theme(axis.text.x = element_text(angle = 90)) +
        stat_pvalue_manual(stat.test, label = "p.adj.signif",
                           step.increase = 0.08, hide.ns = TRUE, tip.length = 0))


# Save the plot
ggsave(filename = "TMB-Bxp-stat-test.pdf", 
       path = plots_dir, 
       width = 12, 
       height = 6, 
       device = "pdf", 
       useDingbats = FALSE)


# Create jitter plot
print(df_plot_cancer %>% 
        ggplot2::ggplot() + 
        ggplot2::aes(x = tumor_descriptor, 
                     y = log10_tmb,
                     color = Timepoint_cancer) + 
        ggplot2::geom_jitter(width = 0.2, alpha = 0.5, size = 1.5) + 
        # light guiding line representing 0 exposure
        ggplot2::geom_hline(yintercept = 0, size = 0.15) +
        ggplot2::scale_color_manual(values = palette, 
                                    breaks = sort(names(palette))) +
        facet_wrap(~short_histology_sum, nrow = 1) +
        theme_Publication() + 
        theme(axis.text.x = element_text(angle = 90)) +
        stat_pvalue_manual(stat.test, label = "p.adj.signif",
                           step.increase = 0.08, hide.ns = TRUE, tip.length = 0))


# Save the plot
ggsave(filename = "TMB-jitter-stat-test.pdf", 
       path = plots_dir, 
       width = 12, 
       height = 6, 
       device = "pdf", 
       useDingbats = FALSE)

# This code was inspired by the following:
# HOW TO PERFORM MULTIPLE PAIRED T-TESTS IN R
# https://www.datanovia.com/en/blog/how-to-perform-multiple-paired-t-tests-in-r/

#TODO: To choose one of the plots in this code chunk `summary-statistics`
sessionInfo()
R version 4.2.3 (2023-03-15)
Platform: aarch64-apple-darwin20 (64-bit)
Running under: macOS Ventura 13.4.1

Matrix products: default
LAPACK: /Library/Frameworks/R.framework/Versions/4.2-arm64/Resources/lib/libRlapack.dylib

locale:
[1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8

attached base packages:
[1] grid      stats     graphics  grDevices utils     datasets  methods   base     

other attached packages:
 [1] ggpubr_0.6.0    rstatix_0.7.2   ggrepel_0.9.3   ggthemes_4.2.4  lubridate_1.9.2 forcats_1.0.0   stringr_1.5.0   dplyr_1.1.2    
 [9] purrr_1.0.1     readr_2.1.4     tidyr_1.3.0     tibble_3.2.1    ggplot2_3.4.2   tidyverse_2.0.0

loaded via a namespace (and not attached):
 [1] sass_0.4.7             bit64_4.0.5            vroom_1.6.3            jsonlite_1.8.7         carData_3.0-5         
 [6] R.utils_2.12.2         bslib_0.5.0            stats4_4.2.3           GenomeInfoDbData_1.2.9 yaml_2.3.7            
[11] pillar_1.9.0           backports_1.4.1        glue_1.6.2             digest_0.6.33          GenomicRanges_1.50.2  
[16] XVector_0.38.0         ggsignif_0.6.4         colorspace_2.1-0       htmltools_0.5.5        R.oo_1.25.0           
[21] pkgconfig_2.0.3        broom_1.0.5            zlibbioc_1.44.0        scales_1.2.1           tzdb_0.4.0            
[26] timechange_0.2.0       generics_0.1.3         farver_2.1.1           IRanges_2.32.0         car_3.1-2             
[31] cachem_1.0.8           withr_2.5.0            BiocGenerics_0.44.0    cli_3.6.1              magrittr_2.0.3        
[36] crayon_1.5.2           evaluate_0.21          R.methodsS3_1.8.2      fansi_1.0.4            textshaping_0.3.6     
[41] tools_4.2.3            data.table_1.14.8      hms_1.1.3              lifecycle_1.0.3        S4Vectors_0.36.2      
[46] munsell_0.5.0          compiler_4.2.3         jquerylib_0.1.4        GenomeInfoDb_1.34.9    systemfonts_1.0.4     
[51] rlang_1.1.1            RCurl_1.98-1.12        rstudioapi_0.15.0      bitops_1.0-7           labeling_0.4.2        
[56] rmarkdown_2.23         gtable_0.3.3           abind_1.4-5            R6_2.5.1               knitr_1.43            
[61] fastmap_1.1.1          bit_4.0.5              utf8_1.2.3             rprojroot_2.0.3        ragg_1.2.5            
[66] stringi_1.7.12         parallel_4.2.3         Rcpp_1.0.11            vctrs_0.6.3            tidyselect_1.2.0      
[71] xfun_0.39             
---
title: "TMB of tumors across unpaired longitudinal samples in the PBTA Cohort"
author: 'Antonia Chroni <chronia@chop.edu> and Jo Lynne Rokita <rokita@chop.edu> for D3B'
date: "2023"
output:
  html_notebook:
    toc: TRUE
    toc_float: TRUE
---

#### Tumor evolution project 

### Data used 
In this notebook, we are using the `pbta.tsv` file geenrated by the `sample-distribution-analysis` module.

# Set up
```{r load-library}
suppressPackageStartupMessages({
  library(tidyverse)
  library(rstatix)
  library(ggpubr)
})
```

## Directories and File Inputs/Outputs
```{r set-dir-and-file-names}
# Detect the ".git" folder -- this will be in the project root directory
# Use this as the root directory to ensure proper sourcing of functions 
# no matter where this is called from
root_dir <- rprojroot::find_root(rprojroot::has_dir(".git"))
analysis_dir <- file.path(root_dir, "analyses", "tmb-vaf-longitudinal")
input_dir <- file.path(analysis_dir, "input")
files_dir <- file.path(root_dir, "analyses", "sample-distribution-analysis", "results")

# Input files
pbta_file <- file.path(files_dir, "pbta.tsv") # file from add-sample-distribution module
tmb_file <- file.path(input_dir, "snv-mutation-tmb-coding.tsv")
palette_file <- file.path(root_dir, "figures", "palettes", "tumor_descriptor_color_palette.tsv")

# File path to plot directory
plots_dir <-
  file.path(analysis_dir, "plots")
if (!dir.exists(plots_dir)) {
  dir.create(plots_dir)
}

source(paste0(root_dir, "/figures/scripts/theme.R"))
```

## Read in data and process
```{r load-process-inputs}
# Make this reproducible
set.seed(2023)

# Read in tmb file and remove hypermutant samples
tmb <- readr::read_tsv(tmb_file, guess_max = 100000, show_col_types = FALSE) %>%
  filter(!tmb >= 10) %>% 
  dplyr::rename(Kids_First_Biospecimen_ID = Tumor_Sample_Barcode) # change name of the biospecimen to match the one from the histologies files

# Read in pbta file and create df
df <- readr::read_tsv(pbta_file, guess_max = 100000, show_col_types = FALSE) %>%
  filter(!(experimental_strategy == "RNA-Seq"),
         !(tumor_descriptor == "Unavailable")) %>% # these are 2 samples
  left_join(tmb, by = c("Kids_First_Biospecimen_ID", "experimental_strategy")) %>%
  select(Kids_First_Participant_ID, Kids_First_Biospecimen_ID, tumor_descriptor,
         broad_histology, match_id_DNA, experimental_strategy, 
         cancer_group, short_histology, tmb, mutation_count, DNA) %>%
  dplyr:::mutate(short_histology_sum = ifelse(short_histology == "HGAT", "HGG",
                                              ifelse(short_histology == "LGAT", "LGG", 
                                                     ifelse(short_histology == "Ependymoma", "Ependymoma", 
                                                            ifelse(short_histology == "Medulloblastoma", "Medulloblastoma", 
                                                                   ifelse(short_histology == "ATRT", "ATRT", "Other"))))),
                 log10_tmb = abs(log10(tmb)),
                 tumor_descriptor = factor(tumor_descriptor),
                 tumor_descriptor = fct_relevel(tumor_descriptor, c("Diagnosis", "Progressive", "Recurrence", "Deceased", "Second Malignancy", "Unavailable"))) %>% 
  distinct(match_id_DNA, .keep_all = TRUE) %>% # keep only one bs_sample per genomic assay (WGS/WXS)
  filter(!is.na(tmb))

# How many samples per Timepoint?
print(df %>% count(tumor_descriptor))

# How many samples per cancer type?
print(df %>% count(short_histology_sum))

# How many samples per Timepoint and cancer type?
rename(count(df, tumor_descriptor, short_histology_sum), Freq = n)

# How many cases with unpaired longitudinal samples?
no_samples <- length(unique(df$Kids_First_Participant_ID))

# Let's count and remove any timepoints with less than 2 samples therein
timepoint_n_df <- df %>% 
  count(tumor_descriptor) %>% 
  dplyr::mutate(tumor_descriptor_n = glue::glue("{tumor_descriptor} (N={n})")) %>% 
  dplyr::rename(timepoint_n = n)

df_plot <- df %>% 
  left_join(timepoint_n_df, by = c("tumor_descriptor")) %>%
  filter(!timepoint_n <= 2) # remove if total number of values per group is less than 2
``` 


## How many patient samples in the unpaired longitudinal data?
There are `r no_samples` samples in total.

# All unpaired samples
We are interested in how TMB changes over the time in all PBTA samples (unpaired samples).

```{r define-parameters-for-plots}
# Read color palette
palette_df <- readr::read_tsv(palette_file, guess_max = 100000, show_col_types = FALSE) %>% 
  mutate(tumor_descriptor = color_names)

# Define and order palette
palette <- palette_df$hex_codes
names(palette) <- palette_df$tumor_descriptor

# Define label for plots
Timepoint <- df_plot$tumor_descriptor

# Define ylim
ylim <- max(df_plot$tmb)
```

```{r plot-total, fig.width = 10, fig.height = 5, fig.fullwidth = TRUE}
print(df_plot %>% 
        ggplot2::ggplot() + 
        ggplot2::aes(x = tumor_descriptor, 
                     y = tmb,
                     color = Timepoint) + 
        ggplot2::geom_jitter(width = 0.2, alpha = 0.5, size = 1.5) + 
        # light guiding line representing 0 exposure
        ggplot2::geom_hline(yintercept = 0, size = 0.15) + 
        ggplot2::scale_color_manual(values = palette, 
                                    breaks = sort(names(palette))) +
        ggplot2::stat_summary(color = "black", size = 0.3) + 
        ggplot2::labs(x = "Tumor descriptor",
                      y = "TMB") +
        theme_Publication() +
        scale_y_continuous(limits = c(0, ylim)) +
        theme(axis.text.x = element_text(angle = 90)))

# Save the plot
ggsave(filename = "TMB-unpaired-longitudinal-samples.pdf",
       path = plots_dir, 
       width = 10, 
       height = 5, 
       device = "pdf", 
       useDingbats = FALSE)
```


# All unpaired samples across cancer types
We will use the `short_histology` column to display TMB differences across cancer types.

```{r plot-cancer-group, fig.width = 25, fig.height = 20, fig.fullwidth = TRUE}
print(df_plot %>%
        ggplot2::ggplot() + 
        ggplot2::aes(x = tumor_descriptor, 
                     y = tmb,
                     color = Timepoint) + 
        ggplot2::geom_jitter(width = 0.2, alpha = 0.5, size = 1.5) + 
        # light guiding line representing 0 exposure
        ggplot2::geom_hline(yintercept = 0, size = 0.15) + 
        ggplot2::scale_color_manual(values = palette, 
                                    breaks = sort(names(palette))) +
        ggplot2::stat_summary(color = "black", size = 0.3) + 
        ggplot2::facet_wrap(~short_histology, nrow = 8) +
        ggplot2::labs(x = "Tumor descriptor",
                      y = "TMB") +
        theme_Publication() + 
        theme(axis.text.x = element_text(angle = 90)) +
        scale_y_continuous(limits = c(0, ylim)))

# Save the plot
ggsave(filename = "TMB-unpaired-longitudinal-samples-cancer-type.pdf", 
       path = plots_dir, 
       width = 25, 
       height = 20, 
       device = "pdf", 
       useDingbats = FALSE)
```

We observe that majority of samples are High- and Low-grade gliomas as well as Ependymoma, so we will classify accordingly (versus any other cancer types).

# All unpaired samples across cancer types summarized

```{r plot-cancer-group-sum, fig.width = 12, fig.height = 6, fig.fullwidth = TRUE}
# Let's count and remove any cancer types and timepoints with less than 2 samples therein
tumor_descriptor_cancer_n_df <- df_plot %>% 
  count(short_histology_sum, tumor_descriptor) %>% 
  dplyr::mutate(tumor_descriptor_cancer_n = glue::glue("{short_histology_sum}_{tumor_descriptor}  (N={n})")) %>% 
  dplyr::rename(histology_n = n)

df_plot_cancer <- df_plot %>% 
  left_join(tumor_descriptor_cancer_n_df, by = c("tumor_descriptor", "short_histology_sum")) %>%
  filter(!histology_n <= 2) # remove if total number of values per group is less than 2

# Define label for plots
Timepoint_cancer <- df_plot_cancer$tumor_descriptor

# Define ylim
ylim_cancer <- max(df_plot_cancer$tmb)

# Plot the plot!
print(df_plot_cancer %>% 
        ggplot2::ggplot() + 
        ggplot2::aes(x = tumor_descriptor, 
                     y = tmb,
                     color = Timepoint_cancer) + 
        ggplot2::geom_jitter(width = 0.2, alpha = 0.5, size = 1.5) + 
        # light guiding line representing 0 exposure
        ggplot2::geom_hline(yintercept = 0, size = 0.15) + 
        ggplot2::scale_color_manual(values = palette, 
                                    breaks = sort(names(palette))) +
        ggplot2::stat_summary(color = "black", size = 0.3) + 
        ggplot2::facet_wrap(~short_histology_sum, nrow = 1) +
        ggplot2::labs(x = "Tumor descriptor",
                      y = "TMB") +
        theme_Publication() + 
        theme(axis.text.x = element_text(angle = 90)) +
        scale_y_continuous(limits = c(0, ylim_cancer)))

# Save the plot
ggsave(filename = "TMB-unpaired-longitudinal-samples-cancer-type-sum.pdf", 
       path = plots_dir, 
       width = 12, 
       height = 6, 
       device = "pdf", 
       useDingbats = FALSE)


# Plot by using tumor_descriptor_cancer_n
# Let's order first x axis 
#TODO: To make the f reproducible
f = c("ATRT_Diagnosis  (N=48)", "ATRT_Progressive  (N=10)", "ATRT_Recurrence  (N=3)", "ATRT_Deceased  (N=4)",
  "Ependymoma_Diagnosis  (N=113)", "Ependymoma_Progressive  (N=20)" , "Ependymoma_Recurrence  (N=24)", "Ependymoma_Deceased  (N=7)",
      "HGG_Diagnosis  (N=235)", "HGG_Progressive  (N=22)", "HGG_Recurrence  (N=15)", "HGG_Deceased  (N=72)", "HGG_Second Malignancy  (N=5)",
      "LGG_Diagnosis  (N=405)" , "LGG_Progressive  (N=61)", "LGG_Recurrence  (N=25)", "LGG_Deceased  (N=7)", "LGG_Second Malignancy  (N=4)" ,
      "Medulloblastoma_Diagnosis  (N=217)", "Medulloblastoma_Progressive  (N=6)", "Medulloblastoma_Recurrence  (N=6)" , "Medulloblastoma_Deceased  (N=6)",
      "Other_Diagnosis  (N=538)", "Other_Progressive  (N=89)", "Other_Recurrence  (N=80)", "Other_Deceased  (N=8)")

df_plot_cancer <- df_plot_cancer %>% 
  mutate(tumor_descriptor_cancer_n = factor(tumor_descriptor_cancer_n),
                 tumor_descriptor_cancer_n = fct_relevel(tumor_descriptor_cancer_n, f))
                                                         
# Plot the plot!
print(df_plot_cancer %>% 
        ggplot2::ggplot() + 
        ggplot2::aes(x = tumor_descriptor_cancer_n, 
                     y = tmb,
                     color = Timepoint_cancer) + 
        ggplot2::geom_jitter(width = 0.2, alpha = 0.5, size = 1.5) + 
        # light guiding line representing 0 exposure
        ggplot2::geom_hline(yintercept = 0, size = 0.15) + 
        ggplot2::scale_color_manual(values = palette, 
                                    breaks = sort(names(palette))) +
        ggplot2::stat_summary(color = "black", size = 0.3) + 
        ggplot2::labs(x = "Tumor descriptor",
                      y = "TMB") +
        theme_Publication() + 
        theme(axis.text.x = element_text(angle = 90)) +
        scale_y_continuous(limits = c(0, ylim_cancer)))

# Save the plot
ggsave(filename = "TMB-unpaired-longitudinal-samples-cancer-type-sum-n.pdf", 
       path = plots_dir, 
       width = 12, 
       height = 6, 
       device = "pdf", 
       useDingbats = FALSE)

#TODO: To choose one of the plots in the code chunk `plot-cancer-group-sum`
```

# Summary statistics for Timepoints and cancer types
Considering the inequalities in the sampling effort for each timepoint, let's explore whether the sampling size affects the observed patterns, e.g., indication of Progressive and Recurrence having higer TMB compared to Diagnosis for High-grade glioma patients.

```{r summary-statistics, fig.width = 12, fig.height = 6, fig.fullwidth = TRUE}
# Inspect some random rows of the data by groups
set.seed(2023)
df_plot_cancer %>% 
  select(short_histology_sum, tumor_descriptor, tmb, log10_tmb) %>% 
  sample_n_by(short_histology_sum, tumor_descriptor, tmb, size = 1)

# Compute some summary statistics (mean and sd) by groups:
stat.summary <- df_plot_cancer %>%
  group_by(short_histology_sum, tumor_descriptor) %>%
  get_summary_stats(tmb, type = "mean_sd")
stat.summary

# Pairwise comparisons between time points at each group levels
# Paired t-test is used because we have repeated measures by time
stat.test <- df_plot_cancer %>%
  group_by(short_histology_sum) %>%
  pairwise_t_test(log10_tmb ~ tumor_descriptor, 
                  paired = FALSE, 
                  p.adjust.method = "bonferroni")
stat.test

# Add statistical test p-values
stat.test <- stat.test %>% add_xy_position(x = "tumor_descriptor")

# Create bxp
print(ggboxplot(df_plot_cancer,
                x = "tumor_descriptor",
                y = "log10_tmb",
                color = "tumor_descriptor",
                palette = palette) +
        facet_wrap(~short_histology_sum, nrow = 1) +
        theme_Publication() + 
        theme(axis.text.x = element_text(angle = 90)) +
        stat_pvalue_manual(stat.test, label = "p.adj.signif",
                           step.increase = 0.08, hide.ns = TRUE, tip.length = 0))

# Save the plot
ggsave(filename = "TMB-Bxp-stat-test.pdf", 
       path = plots_dir, 
       width = 12, 
       height = 6, 
       device = "pdf", 
       useDingbats = FALSE)


# Create jitter plot
print(df_plot_cancer %>% 
        ggplot2::ggplot() + 
        ggplot2::aes(x = tumor_descriptor, 
                     y = log10_tmb,
                     color = Timepoint_cancer) + 
        ggplot2::geom_jitter(width = 0.2, alpha = 0.5, size = 1.5) + 
        # light guiding line representing 0 exposure
        ggplot2::geom_hline(yintercept = 0, size = 0.15) +
        ggplot2::scale_color_manual(values = palette, 
                                    breaks = sort(names(palette))) +
        facet_wrap(~short_histology_sum, nrow = 1) +
        theme_Publication() + 
        theme(axis.text.x = element_text(angle = 90)) +
        stat_pvalue_manual(stat.test, label = "p.adj.signif",
                           step.increase = 0.08, hide.ns = TRUE, tip.length = 0))

# Save the plot
ggsave(filename = "TMB-jitter-stat-test.pdf", 
       path = plots_dir, 
       width = 12, 
       height = 6, 
       device = "pdf", 
       useDingbats = FALSE)

# This code was inspired by the following:
# HOW TO PERFORM MULTIPLE PAIRED T-TESTS IN R
# https://www.datanovia.com/en/blog/how-to-perform-multiple-paired-t-tests-in-r/

#TODO: To choose one of the plots in this code chunk `summary-statistics`
```

```{r echo=TRUE}
sessionInfo()
```
